### Replication File for 
### "Does Ideology Influence Hiring in China? Evidence from Two Randomized Experiments"
### Jennifer Pan and Tongtong Zhang

### This file: codes that replicate all tables, figures, and statistics in the main paper

######################################################################################
## Section 2.1 Treatments
######################################################################################
## Load Data and Packages
rm(list=ls())  
# Please set the working directory to the folder where the ReadMe.txt is located.
#setwd("")
d<-read.csv("Data/Pretest2_ManipulationCheck.csv",header=TRUE, stringsAsFactors = FALSE,sep=",") 
#install.packages("lmtest")
#install.packages("sandwich")
suppressMessages(library(lmtest))
## Function to compute var-cov matrix using clustered robust standard errors
vcovCluster <- function(
  model,
  cluster
)
{
  require(sandwich)
  require(lmtest)
  if(nrow(model.matrix(model))!=length(cluster)){
    stop("check your data: cluster variable has different N than model")
  }
  M <- length(unique(cluster))
  N <- length(cluster)           
  K <- model$rank   
  if(M<50){
    warning("Fewer than 50 clusters, variances may be unreliable (could try block bootstrap instead).")
  }
  dfc <- (M/(M - 1)) * ((N - 1)/(N - K))
  uj  <- apply(estfun(model), 2, function(x) tapply(x, cluster, sum));
  rcse.cov <- dfc * sandwich(model, meat = crossprod(uj)/N)
  return(rcse.cov)
}

###############################
####### Figure 1 #############
###############################
## Create a blank dataset to save the effect of treatment on each feature of applicants
results<-matrix(NA, nrow=18,ncol=9) 
results<-data.frame(results)
colnames(results)<-c("feature","w_p","w_se","w_low","w_upper","s_p","s_se","s_low","s_upper")

## 18 features we test
results[,1]<-c("loyalty", "political_active","creativity","conservative",
               "critical","bookish","patriotic","obedient","collaborative","dedicated",
               "risk_taking","independent","strategic","problem_solving","connections",
               "efficiency","leadership","merit")

## The effect of treatment on each feature
# Feature 1: Politicaly loyalty
d$Y<-d$allegiance
m_extra_allegiance<-lm(Y~chnsocial+western,data=d)
marx_extra_allegiance<-coeftest(m_extra_allegiance,vcov=vcovCluster(m_extra_allegiance,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="loyalty",c("w_p","w_se")]<-marx_extra_allegiance[2,1:2] 
results[results$feature=="loyalty",c("s_p","s_se")]<-marx_extra_allegiance[1,1:2]

# Feature 2: Politically active
d$Y<-as.numeric(d$political.active) 
m_extra_political_active<-lm(Y~chnsocial+western,data=d) 
marx_extra_political_active<-coeftest(m_extra_political_active,vcov=vcovCluster(m_extra_political_active,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="political_active",c("w_p","w_se")]<-marx_extra_political_active[2,1:2] 
results[results$feature=="political_active",c("s_p","s_se")]<-marx_extra_political_active[1,1:2]

# Feature 3: Creativity
d$Y<-as.numeric(d$creativity)  
m_extra_creativity<-lm(Y~chnsocial+western,data=d) 
marx_extra_creativity<-coeftest(m_extra_creativity,vcov=vcovCluster(m_extra_creativity,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="creativity",c("w_p","w_se")]<-marx_extra_creativity[2,1:2] 
results[results$feature=="creativity",c("s_p","s_se")]<-marx_extra_creativity[1,1:2]

# Feature 4: Conservative
d$Y<-as.numeric(d$conservative) 
m_extra_conservative<-lm(Y~chnsocial+western,data=d) 
marx_extra_conservative<-coeftest(m_extra_conservative,vcov=vcovCluster(m_extra_conservative,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="conservative",c("w_p","w_se")]<-marx_extra_conservative[2,1:2]
results[results$feature=="conservative",c("s_p","s_se")]<-marx_extra_conservative[1,1:2]

# Feature 5: Critical thinking
d$Y<-as.numeric(d$critical)  
m_extra_critical<-lm(Y~chnsocial+western,data=d) 
marx_extra_critical<-coeftest(m_extra_critical,vcov=vcovCluster(m_extra_critical,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="critical",c("w_p","w_se")]<-marx_extra_critical[2,1:2]
results[results$feature=="critical",c("s_p","s_se")]<-marx_extra_critical[1,1:2]

# Feature 6: Bookish
d$Y<-as.numeric(d$bookish)
m_extra_bookish<-lm(Y~chnsocial+western,data=d)
marx_extra_bookish<-coeftest(m_extra_bookish,vcov=vcovCluster(m_extra_bookish,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="bookish",c("w_p","w_se")]<-marx_extra_bookish[2,1:2]
results[results$feature=="bookish",c("s_p","s_se")]<-marx_extra_bookish[1,1:2]

# Feature 7: Patriotic
d$Y<-as.numeric(d$patriotic)  
m_extra_patriotic<-lm(Y~chnsocial+western,data=d) 
marx_extra_patriotic<-coeftest(m_extra_patriotic,vcov=vcovCluster(m_extra_patriotic,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="patriotic",c("w_p","w_se")]<-marx_extra_patriotic[2,1:2]
results[results$feature=="patriotic",c("s_p","s_se")]<-marx_extra_patriotic[1,1:2]

# Feature 8: Obedient
d$Y<-as.numeric(d$obedient)  
m_extra_obedient<-lm(Y~chnsocial+western,data=d)
marx_extra_obedient<-coeftest(m_extra_obedient,vcov=vcovCluster(m_extra_obedient,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="obedient",c("w_p","w_se")]<-marx_extra_obedient[2,1:2]
results[results$feature=="obedient",c("s_p","s_se")]<-marx_extra_obedient[1,1:2]

# Feature 9: Collaborative
d$Y<-as.numeric(d$collaborative) 
m_extra_collaborative<-lm(Y~chnsocial+western,data=d)
marx_extra_collaborative<-coeftest(m_extra_collaborative,vcov=vcovCluster(m_extra_collaborative,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="collaborative",c("w_p","w_se")]<-marx_extra_collaborative[2,1:2]
results[results$feature=="collaborative",c("s_p","s_se")]<-marx_extra_collaborative[1,1:2]

# Feature 10: Dedicated
d$Y<-as.numeric(d$dedicated)  
m_extra_dedicated<-lm(Y~chnsocial+western,data=d)
marx_extra_dedicated<-coeftest(m_extra_dedicated,vcov=vcovCluster(m_extra_dedicated,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="dedicated",c("w_p","w_se")]<-marx_extra_dedicated[2,1:2]
results[results$feature=="dedicated",c("s_p","s_se")]<-marx_extra_dedicated[1,1:2]

# Feature 11: Risk_taking
d$Y<-as.numeric(d$risk.acceptance)
m_extra_risk_taking<-lm(Y~chnsocial+western,data=d)
marx_extra_risk_taking<-coeftest(m_extra_risk_taking,vcov=vcovCluster(m_extra_risk_taking,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="risk_taking",c("w_p","w_se")]<-marx_extra_risk_taking[2,1:2] 
results[results$feature=="risk_taking",c("s_p","s_se")]<-marx_extra_risk_taking[1,1:2]

# Feature 12: Independence
d$Y<-as.numeric(d$independent) 
m_extra_independent<-lm(Y~chnsocial+western,data=d)
marx_extra_independent<-coeftest(m_extra_independent,vcov=vcovCluster(m_extra_independent,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="independent",c("w_p","w_se")]<-marx_extra_independent[2,1:2]
results[results$feature=="independent",c("s_p","s_se")]<-marx_extra_independent[1,1:2] 

# Feature 13: Strategic
d$Y<-as.numeric(d$strategic)  
m_extra_strategic<-lm(Y~chnsocial+western,data=d)
marx_extra_strategic<-coeftest(m_extra_strategic,vcov=vcovCluster(m_extra_strategic,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="strategic",c("w_p","w_se")]<-marx_extra_strategic[2,1:2] 
results[results$feature=="strategic",c("s_p","s_se")]<-marx_extra_strategic[1,1:2]

# Feature 14: Problem Solving
d$Y<-as.numeric(d$problem.solving) 
m_extra_problem_solving<-lm(Y~chnsocial+western,data=d)
marx_extra_problem_solving<-coeftest(m_extra_problem_solving,vcov=vcovCluster(m_extra_problem_solving,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="problem_solving",c("w_p","w_se")]<-marx_extra_problem_solving[2,1:2] 
results[results$feature=="problem_solving",c("s_p","s_se")]<-marx_extra_problem_solving[1,1:2]

# Feature 15: Social connections
d$Y<-as.numeric(d$connections)  
m_extra_connections<-lm(Y~chnsocial+western,data=d)
marx_extra_connections<-coeftest(m_extra_connections,vcov=vcovCluster(m_extra_connections,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="connections",c("w_p","w_se")]<-marx_extra_connections[2,1:2]
results[results$feature=="connections",c("s_p","s_se")]<-marx_extra_connections[1,1:2]

# Feature 16: Work Efficiency
d$Y<-as.numeric(d$productive)  
m_extra_productive<-lm(Y~chnsocial+western,data=d)
marx_extra_productive<-coeftest(m_extra_productive,vcov=vcovCluster(m_extra_productive,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="efficiency",c("w_p","w_se")]<-marx_extra_productive[2,1:2] 
results[results$feature=="efficiency",c("s_p","s_se")]<-marx_extra_productive[1,1:2]

# Feature 17: Leadership
d$Y<-as.numeric(d$leadership)  
m_extra_leadership<-lm(Y~chnsocial+western,data=d)
marx_extra_leadership<-coeftest(m_extra_leadership,vcov=vcovCluster(m_extra_leadership,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="leadership",c("w_p","w_se")]<-marx_extra_leadership[2,1:2] 
results[results$feature=="leadership",c("s_p","s_se")]<-marx_extra_leadership[1,1:2]

# Feature 18: Academic merit
d$Y<-as.numeric(d$merit) 
m_extra_merit<-lm(Y~chnsocial+western,data=d)
marx_extra_merit<-coeftest(m_extra_merit,vcov=vcovCluster(m_extra_merit,cluster=d$respondent[complete.cases(d$Y)]))[2:3,c(1,2,4)]
results[results$feature=="merit",c("w_p","w_se")]<-marx_extra_merit[2,1:2] 
results[results$feature=="merit",c("s_p","s_se")]<-marx_extra_merit[1,1:2]

## Calculate the lower and upper bounds of 95% confidence interval for treatment effect on each feature
results$w_low <-results$w_p - 1.96*results$w_se
results$w_upper <-results$w_p + 1.96*results$w_se
results$s_low <-results$s_p - 1.96*results$s_se
results$s_upper <-results$s_p + 1.96*results$s_se
min(c(results$w_low, results$s_low))
max(c(results$w_upper, results$s_upper))
# Jitter the label on the zero effects attributes
results[results$feature=="connections",c("w_p","s_p")]<-c(0.0770286,0)
results[results$feature=="leadership",c("w_p","s_p")]<-c(0.077,0)
results[results$feature=="merit",c("w_p","s_p")]<-c(0.054,0.129)

## Plotting Figure 1
pdf("Log/Figure1.pdf",width=10,height=8)
par(mar=c(4, 11, 1, 1) + 0.1)
y<-c(43.9,seq(from=33.9,to=0.9,by=-2)) #all y values for effects to be plotted
plot(results$w_p, y, xlim=c(-2.5,2.5), ylim=c(-1.7,44.5),main="", yaxt="n", ylab="", xlab="", pch="w", cex=1.5, cex.axis=1.5, cex.lab=1.5)
abline(v=0,lty=2)
for(i in 1:18){
  segments(results[i,4],y[i],results[i,5],y[i], lwd=2, lty=1)
}
y_s<-c(44.1,seq(from=34.1,to=1.1,by=-2)) #all y values for effects to be plotted
points(results$s_p, y_s, pch="s", cex=1.5, cex.axis=1, cex.lab=1)
for(i in 1:18){
  segments(results[i,8],y_s[i],results[i,9],y_s[i], lwd=2, lty=3)
}
mtext("Rating of Applicant Compared to Comic Book Club", side=1, line=2.5, at=0, cex=1.5)
arrows(results$w_p[1], 41.3, results$w_p[1], 42.6, length=0.05, lwd=1)
suppressWarnings(text(results$w_p[1], 38.5, expression(italic('Western Political\nPhilosophy Study Group')), cex=1))
arrows(results$s_p[1], 41.3, results$s_p[1], 42.6, length=0.05, lwd=1)
suppressWarnings(text(results$s_p[1], 38.5, expression(italic('Socialism with Chinese\nCharacteristics Study Group')), cex=1))
suppressWarnings(text(-2.7, -1.8, pos=4, expression(italic('Lower rating than\nComic Book Club')), cex=1))
suppressWarnings(text(2.7, -1.8, pos=2, expression(italic('Higher rating than\nComic Book Club')), cex=1))
features <- c("Political loyalty","Politically active","Creative",
              "Conservative","Critical thinking","Bookish","Patriotic","Obedient",
              "Collaborative","Dedicated","Risk taking","Independent",
              "Strategic","Problem solving","Social connections","Work efficiency",
              "Leadership","Academic merit")
par(las=1)
for (i in 1:18){
  mtext(features[i],side=2,line=0.1,at=y[i],cex=1.5)
}
dev.off()

######################################################################################
## Section 3 No Reward for Conformity, Penalty for Non-conformity
######################################################################################
## Load Data and Packages
rm(list=ls())  
d<-read.csv("Data/ResumeAudit.csv",header=TRUE, stringsAsFactors = FALSE,sep=",") 
colnames(d)
#install.packages("plm")
#install.packages("lmtest")
#install.packages("mfx")
#install.packages("bife")
#install.packages("multiwayvcov")
#install.packages("stargazer")
suppressMessages(library(plm))
suppressMessages(library(lmtest))
suppressMessages(library(mfx))
suppressMessages(library(bife))
suppressMessages(library(multiwayvcov))
suppressMessages(library(stargazer))

###############################
####### Table 2 #############
###############################
## Callback rates and observations
results<-matrix(NA, nrow=4,ncol=3)
colnames(results)<-c("","Callback rate","Number of resumes")
results[,1]<-c("Overall","Non-political",
               "Conformist","Non-conformist")
# Overall
results[1,2]<-round(mean(d$callback),digits=3)
results[1,3]<-dim(d)[1] #19,221 unique resumes
# Apolitical control
results[2,2]<-round(mean(subset(d, allegiance=="comic")$callback),digits=3)
results[2,3]<-length(subset(d, allegiance=="comic")$callback)
# Conformity treatment
results[3,2]<-round(mean(subset(d, allegiance=="socialism")$callback),digits=3)
results[3,3]<-length(subset(d, allegiance=="socialism")$callback)
# Non-conformity treatment
results[4,2]<-round(mean(subset(d, allegiance=="western")$callback),digits=3)
results[4,3]<-length(subset(d, allegiance=="western")$callback)
results<-stargazer(results,title="Table 2: Summary Statistics for Callbacks in Audit Experiment",
                   type="text",summary=FALSE)
write.table(results, file="Log/Table2.txt", row.names = FALSE)


rm(results)
###############################
####### Table 3 #############
###############################
# Column 1: No controls 
model1<-logitmfx(callback ~ conformity + non_conformity, data=d, atmean=F) #average marginal effects
coef1<-model1$mfxest[,1]
se1<-model1$mfxest[,2]

# Column 2: With vacancy fixed-effects
d$job_identifier<-as.factor(d$job_identifier) 
lgt.c2 <- bife(callback ~ conformity + non_conformity | job_identifier, data=d, model="logit")
lgt.c2.ame <- get_APEs(lgt.c2)
model2<-coeftest(lgt.c2.ame)
coef2<-model2[,1]
se2<-model2[,2]

# Column 3: With controls
covars <- c("male","high_tier","ccp", "cs", "ee", "econ", "med", "soe", "private", "foreign",
            "javascript", "linux", "php", "python","sql", 
            "running400", "running800", "highjump", "longjump", "shotput", "swim",
            "badminton", "biking", "rollerskate", "music","UJobSameProvince","university_east",
            "university_west")
X<-as.matrix(d[,..covars])
X_demean <- X - predict(lm(X~rep(1,dim(d)[1]))) #demean each control
model3<-logitmfx(callback ~ conformity + non_conformity + X_demean + (conformity + non_conformity)*X_demean, data=d, atmean=F)
coef3<-model3$mfxest[,1]
se3<-model3$mfxest[,2]

## Save the table
results<-stargazer(model1$fit, model2, model3$fit,
                   coef=list(coef1,coef2,coef3),
                   se=list(se1,se2,se3), omit=c("X_", "Constant"),
                   title=c("Table 3"), 
                   align=TRUE, digits=3, type="text",
                  omit.stat=c("all"),
                   no.space = T, dep.var.labels.include=FALSE, model.names = F,
                   add.lines = list(c("Observation",rep("19221",3)))) 
write.table(results, file="Log/Table3.txt", row.names = FALSE)


rm(results)
###################################################################
####### Section 3.1 Rationale (statistics in text on p.12) #############
###################################################################
data<-read.csv("Data/HRSurvey.csv", sep=",", header=TRUE, stringsAsFactors = FALSE)
dim(data) #506 respondents x 3 resume/respondent = 1518 observations
## Reported callbacks on conformity treatment
table(data$Y[data$conformity==1], exclude=NULL)
prop.table(table(data$Y[data$conformity==1]))
## Reported callbacks on non-conformity treatment
table(data$Y[data$non_conformity==1], exclude=NULL)
prop.table(table(data$Y[data$non_conformity==1]))

## Effect of non-conformity vs. conformity in hiring manager survey
## Below we use logit, OLS, and OLS weighted by ownership sector to estimate the effect
## of non-conformity vs. conformity. All these 3 analyses produce the same result:
## 7% penalty on non-conformity when compared to conformity.
# Logit regression
logit.hr<-logitmfx(Y ~ apolitical + non_conformity, data=data, atmean=F) #base group: conformity
logit.hr$mfxest
conformity_callback<-mean(subset(data, resume_allegiance=="Socialism")$Y, na.rm=T) #callback rate in the base group
logit.hr$mfxest[2,1]/conformity_callback*100 #effect size: 7% decrease

# OLS regression
ols.hr <- lm(Y ~ apolitical+non_conformity, data = data) 
vcovCL.hr<-cluster.vcov(ols.hr, data$ResponseId) #robust s.e. clustered on respondent
coeftest(ols.hr, vcov = vcovCL.hr)
#effect size: 7% decrease
coeftest(ols.hr, vcov = vcovCL.hr)[3,1]/conformity_callback*100

# OLS regression (weighted by ownership sector)
prop.table(table(data$sector[!is.na(data$Y)]))
data$weight_Y<-NA # Weight for each obs = audit% / hr_survey%
data$weight_Y[data$sector=="foreign"]<-0.211/0.217
data$weight_Y[data$sector=="private"]<-0.553/0.518
data$weight_Y[data$sector=="public"]<-0.091/0.126
data$weight_Y[data$sector=="soe"]<-0.145/0.138
ols.hr.weight <- lm(Y ~ apolitical + non_conformity, weights=data$weight_Y,  data = data) 
vcovCL.hr.weight<-cluster.vcov(ols.hr.weight, data$ResponseId) 
coeftest(ols.hr.weight, vcov = vcovCL.hr.weight)
#effect size: 7% decrease
coeftest(ols.hr.weight, vcov = vcovCL.hr.weight)[3,1]/conformity_callback*100 

## Remove everything about the hiring manager survey
rm(data,logit.hr,conformity_callback,ols.hr,vcovCL.hr,ols.hr.weight,vcovCL.hr.weight)


######################################################################################
## Section 4 Heterogeneity in Penalty on Non-conformity
######################################################################################

###############################
####### Figure 2 #############
###############################
results<-matrix(NA, nrow=4,ncol=5)
results<-data.frame(results)
colnames(results)<-c("feature","estimate","se","low","upper")
results[,1]<-c("public_institution", "SOE","Private","Foreign")

# Public institutions vs. all other ownership sectors
ols.pool.nc1 <- lm(callback ~ non_conformity*public, data = d) 
vcovCL.pool.nc1<-cluster.vcov(ols.pool.nc1, d$job_identifier) #robust s.e. clustered on vacancy level
results[results$feature=="public_institution",c("estimate","se")]<-coeftest(ols.pool.nc1, vcov = vcovCL.pool.nc1)[4,1:2]
# SOE vs. all other ownership sectors
ols.pool.nc1 <- lm(callback ~ non_conformity*soe, data = d) 
vcovCL.pool.nc1<-cluster.vcov(ols.pool.nc1, d$job_identifier)
results[results$feature=="SOE",c("estimate","se")]<-coeftest(ols.pool.nc1, vcov = vcovCL.pool.nc1)[4,1:2]
# Private vs. all other ownership sectors
ols.pool.nc1 <- lm(callback ~ non_conformity*private, data = d) 
vcovCL.pool.nc1<-cluster.vcov(ols.pool.nc1, d$job_identifier) #robust s.e. clustered on the job level
results[results$feature=="Private",c("estimate","se")]<-coeftest(ols.pool.nc1, vcov = vcovCL.pool.nc1)[4,1:2]
# Foreign vs. all other ownership sectors
ols.pool.nc1 <- lm(callback ~ non_conformity*foreign, data = d) 
vcovCL.pool.nc1<-cluster.vcov(ols.pool.nc1, d$job_identifier) #robust s.e. clustered on the job level
results[results$feature=="Foreign",c("estimate","se")]<-coeftest(ols.pool.nc1, vcov = vcovCL.pool.nc1)[4,1:2]

# Confidence interval of diff in effects of non-conformity
results$low <-results$estimate - 1.96*results$se
results$upper <-results$estimate + 1.96*results$se
results[,c(2,4,5)]<-(-1)*results[,c(2,4,5)]
max(results$low)
min(results$upper)

## Plotting Figure 2
pdf("Log/Figure2.pdf",width=10,height=8)
par(mar=c(4, 11, 1, 5) + 0.1)
y<-c(2,1.5,1,0.5)
plot(results$estimate, y, xlim=c(-0.04,0.04), ylim=c(0,2.2),main="", yaxt="n", ylab="", 
     xlab="", pch=16, cex=1.5, cex.axis=1.5, cex.lab=1.5)
abline(v=0,lty=2)
for(i in 1:4){
  segments(results[i,4],y[i],results[i,5],y[i], lwd=2, lty=1)
}
mtext("Penalty on Non-conformity Compared to Rest of Sample", side=1, line=2.5, at=0, cex=1.5)
suppressWarnings(text(-0.044, -0.05, pos=4, expression(italic('Less penalty on\nnon-conformity')), cex=1.5))
suppressWarnings(text(0.044, -0.05, pos=2, expression(italic('More penalty on\nnon-conformity')), cex=1.5))
features<-c("Public institution", "SOE","Private","Foreign")
par(las=1)
for (i in 1:4){
  mtext(features[i],side=2,line=1,at=y[i],adj=1,cex=1.5)
}
dev.off()


###############################
####### Figure 3 #############
###############################
### Structural Topic Modeling (STM) on Firm Names
# Per IRB requirement, we cannot disclose the names of firms involved in this study.
# So, below we provided and commented out the R codes we used to conduct the STM analysis
# and produce Figure 3 using the saved output of the STM directly.
# All firm names in the saved output of STM have been replaced with "NA".

## Step 1: Pre-processing firm names
#rm(list=ls())
#pkgs <- c("stm", "tm")
#sapply(pkgs, require, character.only=TRUE)
#vacancy<-read.csv("vacancy_raw.csv",header=TRUE, stringsAsFactors = FALSE,sep=",")
#colnames(vacancy)
#vacancy$Employers_seg<-gsub("[A-Za-z]", "", vacancy$Employers_seg)
#vacancy$Employers_seg<-gsub("[0-9]", "", vacancy$Employers_seg)
#c_stopwords<-c("\n", "delimiter","是","的","吗","不","请","于","了","啊","在","微博",
#               "微","博","我","我们","你","你们","他","他们","她","她们","呢","您",
#               "一","个","却","几","能否","什么","有限","公司","集团","分公司","责任","股份","企业") 
#temp_employers <- textProcessor(documents=vacancy$Employers_seg, 
#                                metadata=vacancy,
#                                removestopwords=TRUE, removenumbers=TRUE,
#                                removepunctuation=TRUE, ucp=TRUE, 
#                                customstopwords=c_stopwords,
#                                custompunctuation=c("：", "，", "。","#","@","【", "】", 
#                                                   "《","》","、","；","`","-","——"),  
#                               wordLengths=c(1,6))
#length(temp_employers$vocab) #6400 terms
#length(temp_employers$documents) #6296 firm names
#plotRemoved(temp_employers$documents, lower.thresh = seq(1, 20, by = 1))
## Prep documents (keep terms appearing in [1, 99%] of all firm names)
#bout_employers <- prepDocuments(temp_employers$documents, temp_employers$vocab, temp_employers$meta, 
#                               lower.thresh=1, upper.thres=6234)
#docs_employers <- out_employers$documents
#meta_employers <- out_employers$meta
#vocab_employers <- out_employers$vocab
#length(docs_employers) #6157 unique firm names after pre-processing
#length(vocab_employers) #1871 unique terms after pre-processing
#rm(out_employers,temp_employers)

## Step 2: Choose K topics based on coherence and exclusivity
#set.seed(210324)
#storage <- searchK(GScont.docs_employers, GScont.vocab_employers, 
#                   K = c(seq(10,18, by=1)),
#                   N=floor(0.1 * length(GScont.docs_employers)),
#                   data = GScont.meta_employers)
#K_models <- as.data.frame(storage$results)
#plot.searchK(storage)
## 12 topics chosen: perform relatively better on residuals, semantic coherence, exclusivity, and held-out likelihood

## Step 3: Estimate STM model with the selected K
#K <- 12
#set.seed(210324)
#cont_employers <- stm(docs_employers, vocab_employers, K=K, 
#                      data=meta_employers)
#checkResiduals(cont_employers, docs_employers, tol=0.01)
#checkBeta(cont_employers, tolerance = 0.01)

## Step 4: Hand-label topics
##inspect keywords associated with each topic
#sink("Employers_words.txt") 
#summary(cont_employers)
#sink()
##inspect representative firm names associated with each topic
#thoughtsEmployers <- findThoughts(cont_employers, 
#                                  texts=as.character(meta_employers$employers), 
#                                  n=15) 
#sink("Employers_docs.txt")
#print(thoughtsEmployers$docs[[4]])
#sink()


## Step 5: Esitmate topic prevalance ~ penalty on non-conformity
rm(list=ls())
#install.packages("stm")
sapply("stm", require, character.only=TRUE)
load("Data/stm_output.RData")

# Labels of the 12 topics (output of Step 4)
topic_employers <- c(
  "Pharmaceutical, hospital", 
  "Securities", 
  "Materials, metallurgical", 
  "Financial service outsourcing", 
  "Investment management", 
  "Real estate",  
  "Biotech, education", 
  "Construction",
  "Intelligent software",
  "University research lab",
  "Media, culture, sports",
  "IT and electronics R&D")

# Plotting Figure 3
set.seed(1227)
employer_penalty <- estimateEffect(1:12 ~ penalize_disloyal, cont_employers, 
                                   metadata=meta_employers, uncertainty="None")
summary(employer_penalty) 
#png("Log/Figure3.png", units="in", height=10, width=10, res=240)
pdf("Log/Figure3.pdf",width=12,height=10)
par(mar=c(5, 16, 1, 5) + 0.1)
plot(employer_penalty, covariate = "penalize_disloyal", 
     topics=c(9,7,11,4,12,6,3,8,10,5,1,2),
     model = cont_employers, method = "difference", cov.value1 = 1,
     cov.value2 = 0,
     xlab = "Difference in topic proportions",
     main = "", xlim = c(-0.03, 0.03), ylim=c(-0.2,12),
     labeltype = "custom",  custom.labels="",
     cex.lab=1.3,cex.axis=1.3)
par(las=1)
mtext(topic_employers[c(9,7,11,4,12,6,3,8,10,5,1,2)], line=1,
      at=12:1,side=2, adj=1, cex=1.3)
suppressWarnings(text(-0.033, -0.4, pos=4, expression(italic('Not penalize\nnon-conformity')), cex=1.2))
suppressWarnings(text(0.033, -0.4, pos=2, expression(italic('Penalize\nnon-conformity')), cex=1.2))
dev.off()

###############################
####### Figure 5 #############
###############################
rm(list=ls()) 
suppressMessages(library(lmtest))
suppressMessages(library(mfx))
suppressMessages(library(multiwayvcov))
d<-read.csv("Data/ResumeAudit.csv",header=TRUE, stringsAsFactors = FALSE,sep=",") 

# Effect of non-conformity vs. apolitical and conformity by the 4 cells
results<-matrix(NA, nrow=4,ncol=5)
results<-data.frame(results)
colnames(results)<-c("feature","estimate","se","low","upper")
results[,1]<-c("P_I", "NP_I","P_NI","NP_NI")

# State priority, Innovative firms
d$priority_innovation<-0
d$priority_innovation[d$Innovation==1 & d$State_Priority==1]<-1
model1 <- lm(callback ~ non_conformity*priority_innovation, data = d) 
vcovCL.model1<-cluster.vcov(model1, d$job_identifier) #robust s.e. clustered on vacancy level
coeftest(model1, vcov = vcovCL.model1)
results[results$feature=="P_I",c("estimate","se")]<-coeftest(model1, vcov = vcovCL.model1)[4,1:2]

# Not state prioirty, Innovative firms
d$Nonpriority_innovation<-0
d$Nonpriority_innovation[d$Innovation==1 & d$State_Priority==0]<-1
model1 <- lm(callback ~ non_conformity*Nonpriority_innovation, data = d) 
vcovCL.model1<-cluster.vcov(model1, d$job_identifier)
coeftest(model1, vcov = vcovCL.model1)
results[results$feature=="NP_I",c("estimate","se")]<-coeftest(model1, vcov = vcovCL.model1)[4,1:2]

# State priority, Non-Innovative firms
d$priority_Notinnovation<-0
d$priority_Notinnovation[d$Innovation==0 & d$State_Priority==1]<-1
model1 <- lm(callback ~ non_conformity*priority_Notinnovation, data = d) #base: loyal and apolitical
vcovCL.model1<-cluster.vcov(model1, d$job_identifier) #robust s.e. clustered on the job level
coeftest(model1, vcov = vcovCL.model1)
results[results$feature=="P_NI",c("estimate","se")]<-coeftest(model1, vcov = vcovCL.model1)[4,1:2]

# Not state prioirty, Non-innovative firms
d$Nonpriority_Noninnovation<-0
d$Nonpriority_Noninnovation[d$Innovation==0 & d$State_Priority==0]<-1
model1 <- lm(callback ~ non_conformity*Nonpriority_Noninnovation, data = d) #base: loyal and apolitical
vcovCL.model1<-cluster.vcov(model1, d$job_identifier) #robust s.e. clustered on the job level
coeftest(model1, vcov = vcovCL.model1)
results[results$feature=="NP_NI",c("estimate","se")]<-coeftest(model1, vcov = vcovCL.model1)[4,1:2]

# Plotting Figure 5
results$low <-results$estimate - 1.96*results$se
results$upper <-results$estimate + 1.96*results$se
results[,c(2,4,5)]<-(-1)*results[,c(2,4,5)]
max(results$low)
min(results$upper)
pdf("Log/Figure5.pdf",width=10,height=8)
par(mar=c(5, 19, 1, 1) + 0.1)
y<-c(0.2,0.15,0.1,0.05)
plot(results$estimate, y, xlim=c(-0.1,0.1), ylim=c(0,0.22),main="", yaxt="n", 
     ylab="", xlab="", pch=16, cex=1.5, cex.axis=1.5, cex.lab=1.5)
abline(v=0,lty=2)
for(i in 1:4){
  segments(results[i,4],y[i],results[i,5],y[i], lwd=2, lty=1)
}
mtext("Penalty on Non-conformity Compared to Rest of Sample", 
      side=1, line=3, at=0, cex=1.5)
suppressWarnings(text(-0.11, 0, pos=4, expression(italic('Less penalty on\nnon-conformity')), cex=1.5))
suppressWarnings(text(0.11, 0, pos=2, expression(italic('More penalty on\nnon-conformity')), cex=1.5))
features<-c("CCP priority, innovative", "Not CCP prioirty, innovative", 
            "CCP priority, not innovative", "Not CCP prioirty, not innovative")
par(las=1)
mtext(features[1], line=1, at=y[1],side=2, adj=1, cex=1.5)
mtext(features[2], line=1, at=y[2],side=2, adj=1, cex=1.5)
mtext(features[3], line=1, at=y[3],side=2, adj=1, cex=1.5)
mtext(features[4], line=1, at=y[4],side=2, adj=1, cex=1.5)
dev.off()


